Imports

library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: rpart
## Registered S3 method overwritten by 'spatstat':
##   method     from
##   print.boxx cli
## 
## spatstat 1.64-1       (nickname: 'Help you I can, yes!') 
## For an introduction to spatstat, type 'beginner'
## 
## Note: spatstat version 1.64-1 is out of date by more than 8 months; we recommend upgrading to the latest version.
devtools::load_all("chigcrim/")
## Loading chigcrim
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null
## The following object is masked from 'package:tidyr':
## 
##     matches

Data

For the training data we will use historical data from 2016 - 2018. We will then predict on the data from 2019 to test the model.

# Load training data
crime_2016 <- load_data(2016, strings_as_factors = TRUE)
crime_2017 <- load_data(2017, strings_as_factors = TRUE)
crime_2018 <- load_data(2018, strings_as_factors = TRUE)

# bind_rows faster than rbind
crime_train <- bind_rows(crime_2016, crime_2017, crime_2018)
# Load test data
crime_2019 <- load_data(2019, strings_as_factors = TRUE)

crime_test <- crime_2019

It is then necessary to prepare the data for its’ use within the Generalised Additive Models.

# Extract useful information from the date
# Include week for later use
crime_train %<>%
  mutate(month = factor(month(date)),
         week = factor(week(date)),
         day = factor(day(date)),
         hour = factor(hour(date)),
         yday = yday(date),
         date = date(date)) 

crime_test %<>%
  mutate(month = factor(month(date)),
         week = factor(month(date)),
         day = factor(day(date)),
         hour = factor(hour(date)),
         yday = yday(date),
         date = date(date)) 


# List of features to use in GAM
keep_features <- c("month", "week", "year", "day", "date", "community_area", 
                   "fbi_code", "yday")

# Prep data for use in GAM
# Use all_of(keep_features) to suppress warning message
gam_train <- crime_train %>% select(all_of(keep_features)) %>% na.omit() %>% 
  mutate(day = as.numeric(day))

gam_test <- crime_test %>% select(all_of(keep_features)) %>% na.omit() %>% 
  mutate(day = as.numeric(day))

gam_train
## # A tibble: 806,492 x 8
##    month week   year   day date       community_area fbi_code  yday
##    <fct> <fct> <int> <dbl> <date>              <int> <fct>    <dbl>
##  1 1     1      2016     2 2016-01-02             12 26           2
##  2 1     2      2016    11 2016-01-11              1 06          11
##  3 1     2      2016    11 2016-01-11             38 07          11
##  4 1     2      2016    13 2016-01-13              2 08B         13
##  5 1     1      2016     6 2016-01-06             48 26           6
##  6 1     2      2016    10 2016-01-10             48 26          10
##  7 1     3      2016    15 2016-01-15             19 08B         15
##  8 1     3      2016    18 2016-01-18             74 03          18
##  9 1     2      2016    13 2016-01-13             19 08B         13
## 10 1     3      2016    21 2016-01-21             19 08B         21
## # … with 806,482 more rows

We are interested in predicting the daily number of crimes, each row represents a crime thus we need to sum the rows for each day to get the number of crimes. We can use the count function to achieve this. Later we will look at breaking down the data further to predict how many of each type of crime (by FBI code) will occur in each commmunity area on a given day.

# Quicker than aggregate
train_dat <- gam_train %>% count(yday, year, date) %>% arrange(year, yday)
test_dat <- gam_test %>% count(yday, year, date) %>% arrange(year, yday)

Model creation

We now build the GAMs using the mgcv package.

gam1 <- gam(n ~ s(as.numeric(yday), bs = "cc") + year, data = train_dat, 
            family = "poisson")
plot(train_dat$n, predict(gam1, type = "response"))
abline(a = 0, b = 1, col = "red")

plot(train_dat$date, train_dat$n)
points(train_dat$date, predict(gam1, type = "response"), col = "red")

all_dat <- rbind(train_dat, test_dat)
plot(all_dat$date, all_dat$n, xlab = "Date", ylab = "Daily crimes")
points(all_dat$date, predict(gam1, type = "response", newdata = all_dat), 
       col = c(rep("red", nrow(train_dat)), rep("blue", nrow(test_dat))))
abline(v = date("2019-01-01"), lty = "dashed")

Feature creation

There are several features that may be of interest, whether a day is a weekend or not, the number of crimes over the previous month, the number of crimes on the same day of the previous year.

add_prev_day <- function(dat, start_date) {
    dat$n_pre <- left_join(data.frame(date = dat$date - days(1)), dat, by = "date")$n
    dat$n_pre[is.na(dat$n_pre)] <- 0
    dat$n_pre[dat$date == start_date] <- NA
    return(dat)
}

add_prev_week <- function(dat, start_date) {
    dat$n_pre <- left_join(data.frame(date = dat$date - days(1)), dat, by = "date")$n
    dat$n_pre[is.na(dat$n_pre)] <- 0
    dat$n_pre[dat$date == start_date] <- NA
    return(dat)
}


add_dow <- function(dat) {
  dat$dow <- wday(dat$date)
  return(dat)
}

add_is_fom <- function(dat) {
  dat$is_fom <- mday(dat$date) == 1
  return(dat)
}

add_is_christmas <- function(dat) {
  dat$is_christmas <- format(dat$date, "%d-%m") == "25-12" | 
    format(dat$date, "%d-%m") == "24-12" |
    format(dat$date, "%d-%m") == "26-12"
  return(dat)
}

add_is_NYD <- function(dat) {
  dat$is_NYD <- format(dat$date, "%d-%m") == "01-01" 
  return(dat)
}

add_month <- function(dat) {
  dat$month <- month(dat$date)
  return(dat)
}

add_is_jan <- function(dat) {
  dat$is_jan <- month(dat$date) == 1
  return(dat)
}

add_week <- function(dat) {
  dat$week <- week(dat$date)
  return(dat)
}

train_dat <- add_prev_day(train_dat, "2016-01-01")
train_dat <- add_dow(train_dat)
train_dat <- add_is_fom(train_dat)
train_dat <- add_is_christmas(train_dat)
train_dat <- add_is_NYD(train_dat)
train_dat <- add_month(train_dat)
train_dat <- add_is_jan(train_dat)
train_dat <- add_week(train_dat)
gam2 <- gam(n ~ s(as.numeric(yday), bs = "cc") + n_pre, data = train_dat, family = "poisson")
plot(train_dat$date, train_dat$n, xlim = c(date("2016-02-01"), date("2016-02-28")))
points(train_dat$date[-1], predict(gam2, type = "response"), col = "red")

plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam2, type = "response"), col = "red")

gam3 <- gam(n ~ s(as.numeric(yday), bs = "cc") + s(as.numeric(dow), bs = "cr", k = 5) + n_pre, data = train_dat, family = "poisson")
plot(train_dat$date, train_dat$n, xlim = c(date("2016-02-01"), date("2016-02-28")))
points(train_dat$date[-1], predict(gam3, type = "response"), col = "red")

plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam3, type = "response"), col = "red")

gam4 <- gam(n ~ s(as.numeric(yday), bs = "cc") + as.factor(dow) + n_pre, data = train_dat, family = "poisson")
plot(train_dat$date, train_dat$n, xlim = c(date("2017-12-01"), date("2018-02-28")))
points(train_dat$date[-1], predict(gam4, type = "response"), col = "red")

plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam4, type = "response"), col = "red")

gam5 <- gam(n ~ s(as.numeric(yday), bs = "cc") + as.factor(dow) + n_pre + is_fom + is_christmas + 
              is_jan + is_NYD + as.factor(week), data = train_dat, family = "poisson")
plot(train_dat$date, train_dat$n, xlim = c(date("2016-02-01"), date("2016-02-28")))
points(train_dat$date[-1], predict(gam5, type = "response"), col = "red")

plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam5, type = "response"), col = "red")

plot(train_dat$date[-1], train_dat$n[-1] - predict(gam5, type = "response"))

Discrete Spatial Data

library(sf)
## Linking to GEOS 3.9.0, GDAL 3.2.0, PROJ 7.2.0
# Load the community areas boundaries
com_bounds <- st_read("data/community_areas.shp", quiet = TRUE)
com_bounds %<>% arrange(as.integer(area_numbe))
com_bounds
## Simple feature collection with 77 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -87.94011 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
## geographic CRS: WGS84(DD)
## First 10 features:
##    area area_num_1 area_numbe comarea comarea_id       community perimeter
## 1     0          1          1       0          0     ROGERS PARK         0
## 2     0          2          2       0          0      WEST RIDGE         0
## 3     0          3          3       0          0          UPTOWN         0
## 4     0          4          4       0          0  LINCOLN SQUARE         0
## 5     0          5          5       0          0    NORTH CENTER         0
## 6     0          6          6       0          0       LAKE VIEW         0
## 7     0          7          7       0          0    LINCOLN PARK         0
## 8     0          8          8       0          0 NEAR NORTH SIDE         0
## 9     0          9          9       0          0     EDISON PARK         0
## 10    0         10         10       0          0    NORWOOD PARK         0
##    shape_area shape_len                       geometry
## 1    51259902  34052.40 MULTIPOLYGON (((-87.65456 4...
## 2    98429095  43020.69 MULTIPOLYGON (((-87.68465 4...
## 3    65095643  46972.79 MULTIPOLYGON (((-87.64102 4...
## 4    71352328  36624.60 MULTIPOLYGON (((-87.67441 4...
## 5    57054168  31391.67 MULTIPOLYGON (((-87.67336 4...
## 6    87214799  51973.10 MULTIPOLYGON (((-87.64102 4...
## 7    88316400  49478.43 MULTIPOLYGON (((-87.63182 4...
## 8    76675896  57293.16 MULTIPOLYGON (((-87.62446 4...
## 9    31636314  25937.23 MULTIPOLYGON (((-87.80676 4...
## 10  121959105  80368.37 MULTIPOLYGON (((-87.78002 4...
# Create a neighbours list
com_nb <- spdep::poly2nb(com_bounds, row.names = com_bounds$community)
names(com_nb) <- attr(com_nb, "region.id")
coords <- st_coordinates(st_centroid(st_geometry(com_bounds)))
## Warning in st_centroid.sfc(st_geometry(com_bounds)): st_centroid does not give
## correct centroids for longitude/latitude data
plot(st_geometry(com_bounds), border = "grey")
plot(com_nb, coords, add = TRUE)

# Construct training data set
spatial_dat <- gam_train %>% 
  mutate(community_area = factor(community_area)) %>%
  count(week, year, community_area) %>%
  arrange(year, week, community_area)
# Train GAM
ctrl <- gam.control(nthreads = 8)
gam_spat <- gam(n ~ s(as.numeric(week), bs = "cc") + 
      s(community_area, bs = "mrf", xt = list(nb = com_nb)),
    data = spatial_dat, family = "poisson", control = ctrl)
summary(gam_spat)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## n ~ s(as.numeric(week), bs = "cc") + s(community_area, bs = "mrf", 
##     xt = list(nb = com_nb))
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 3.830587   0.001607    2384   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df Chi.sq p-value    
## s(as.numeric(week))  7.989      8   7738  <2e-16 ***
## s(community_area)   75.970     76 444841  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.939   Deviance explained = 93.9%
## UBRE = 1.7789  Scale est. = 1         n = 12233
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:chigcrim':
## 
##     last_plot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_data <- spatial_dat %>% filter(year == 2016, week == 1)
plot_data %<>% mutate(fit = predict(gam_spat, newdata = plot_data, type = "response")) %>%
  select(community_area, n, fit)
plot_data <- com_bounds %>% left_join(plot_data, by = c("area_numbe" = "community_area"))

gglayers <- list(ggtitle("Actual Counts of Crime in Chicago"),
                 colorspace::scale_fill_continuous_sequential(5, palette = "Inferno"),
                 theme_void(),
                 guides(fill = guide_colorbar(title = "Crime Count")))
p <- ggplot(data = plot_data) +
  geom_sf(aes(fill = n, text = paste0(community, ": ", n))) +
  gglayers
## Warning: Ignoring unknown aesthetics: text
p %>% ggplotly(tooltip = "text") %>% 
  style(hoverlabel = list(bgcolor = "white"), 
        hoveron = "fill", traces = seq.int(2, length(p$x$data)))
p1 <- ggplot(data = plot_data) +
  geom_sf(aes(fill = fit, text = paste0(community, ": ", fit))) +
  gglayers
## Warning: Ignoring unknown aesthetics: text
p1 %>% ggplotly(tooltip = "text") %>% 
  style(hoverlabel = list(bgcolor = "white"), 
        hoveron = "fill", traces = seq.int(2, length(p$x$data)))